# 1. load Libraries
library(sf)
library(tidyverse)
# install.packages('mapview')
library(mapview)
library(spdep)
library(caret)
library(ckanr) # for opening data APIs built on CKAN technology
library(FNN)
library(grid)
library(gridExtra)
library(ggcorrplot)
library(jtools) # for regression model plots
library(stargazer) # for creating table
library(broom)
library(tufte)
library(rmarkdown)
library(kableExtra)
library(tidycensus)
# 2. Identify functions
mapTheme <- function(base_size = 12) {
theme(
text = element_text( color = "black"),
plot.title = element_text(size = 15,colour = "black"),
plot.subtitle=element_text(face="italic"),
plot.caption=element_text(hjust=0),
axis.ticks = element_blank(),
panel.background = element_blank(),axis.title = element_blank(),
axis.text = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=2),
strip.text.x = element_text(size = 14))
}
plotTheme <- function(base_size = 12) {
theme(
text = element_text( color = "black"),
plot.title = element_text(color = "darkred", size=15, face="bold"),
plot.subtitle = element_text(face="italic"),
plot.caption = element_text(hjust=0),
axis.ticks = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line("grey80", size = 0.1),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=2),
strip.background = element_rect(fill = "grey80", color = "white"),
strip.text = element_text(size=12),
axis.title = element_text(size=12),
axis.text = element_text(size=10),
plot.background = element_blank(),
legend.background = element_blank(),
legend.title = element_text(colour = "black", face = "italic"),
legend.text = element_text(colour = "black", face = "italic"),
strip.text.x = element_text(size = 14)
)
}
#3. Load Quantile break functions
qBr <- function(df, variable, rnd) {
if (missing(rnd)) {
as.character(quantile(round(df[[variable]],0),
c(.01,.2,.4,.6,.8), na.rm=T))
} else if (rnd == FALSE | rnd == F) {
as.character(formatC(quantile(df[[variable]]), digits = 3),
c(.01,.2,.4,.6,.8), na.rm=T)
}
}
q5 <- function(variable) {as.factor(ntile(variable, 5))}
# Load hexadecimal color palette
palette <- c('#feedde', '#fdbe85', '#fd8d3c', '#e6550d', '#a63603')
# for calculating average nearest neighbor distance.
nn_function <- function(measureFrom,measureTo,k) {
measureFrom_Matrix <- as.matrix(measureFrom)
measureTo_Matrix <- as.matrix(measureTo)
nn <-
get.knnx(measureTo, measureFrom, k)$nn.dist
output <-
as.data.frame(nn) %>%
rownames_to_column(var = "thisPoint") %>%
gather(points, point_distance, V1:ncol(.)) %>%
arrange(as.numeric(thisPoint)) %>%
group_by(thisPoint) %>%
dplyr::summarize(pointDistance = mean(point_distance)) %>%
arrange(as.numeric(thisPoint)) %>%
dplyr::select(-thisPoint) %>%
pull() # pull() is similar to $. It's mostly useful because it looks a little nicer in pipes, it also works with remote data frames, and it can optionally name the output.
return(output)
}
In this section, we loaded necessary libraries, created plot theme options and map theme options, and identified functions of quintile breaks, and average nearest neighbor distance for further analysis.
In this section, we loaded Miami data and other complementary datasets, conducted feature selection and engineering. Besides, for other analysis, we separate the whole dataset into miami.training and miami.test in advance.
Five categorical datasets were used in this project:
Miami House Price and Internal Characteristics: basic geometric dataset provided by the course in advance.
Miami Base Data: geometric data containing the geographic information of Miami and Miami Beach.
Miami Neighborhood Shapefiles: digital geographic dataset contains polygon features representing neighborhood areas within the City of Miami as well as Miami Beach, Florida.
Census Data: demographic variables from the ACS 2017 for census tracts in Miami-Dade County. We specifically selected 8 variables in the analysis:
TotalPop: Total population in each census tractWhites: White population in each census tractMedHHInc: Median household income in each census tractMedRent: Median Rent for properties in each census tractTotalPoverty: Population living under the level of poverty in each census tractTotalVacant: Total vacant unit in each census tractTotalUnit: Total housing units in each census tractRenterOccpied: Total householder lived in renter-occupied housing unitsWith variables above, we created the four features to be used in the model building.
pctWhite: white population proportion in each census tract
pctPoverty: Poverty population proportion in each census tract
pctTotalVacant: Vacant unites proportion in each census tract
pctRenterOccupied: Proportion of householder living in renter-occupied housing units in each census tract
Notice: Those vacant related variables are vital features we explored that would significantly improve our model and powerfully predict price. Detailed information will be provided in the following analysis.
Amenity Data: compilatory data chosen for describing amenities and public service of Miami-Dade County. The datasets we chose encompasses crime rate, education opportunity, healthcare service and entertainment accessibility. See each dataset below:
Sexual Predator: A point feature class of Sexual Offenders and Predators within Miami-Dade County that is used by the Sexual Offender/Predator Residence Search Internet application.
Public School: A point feature class of the Miami-Dade County Public Schools facilities.
College, University: A point feature class of colleges and universities within Miami-Dade County.
Hospital: A point feature class of the Hospital facilities within Miami-Dade County.
Landmark: A point feature class of landmarks within Miami-Dade County.
Hotel, Motel, Inn: A point feature class of Hotel, Motel and Inn facilities within Miami-Dade County.
Shopping Mall: A point feature class that contains the locations of major shopping malls within Miami-Dade County. Shopping Mall is considered a large retail complex containing a variety of stores and often restaurants in which one or more buildings form a complex of shops representing merchandisers with interconnecting walkways that enable customers to walk from unit to unit.
# I. Base data
## House Price & internal characteristics
# miami.sf <- st_read('C:/Users/Yiming Ma/Documents/UPenn/MUSA508/hw/hw2/studentsData.geojson')
miami.sf <- st_read('/Users/penguin/Box Sync/GitHub/MUSA508-Midterm/studentsData.geojson')
## Miami base data
miami.base <-st_read("https://opendata.arcgis.com/datasets/5ece0745e24b4617a49f2e098df8117f_0.geojson") %>%
filter(NAME %in% c("MIAMI", "MIAMI BEACH"))
## Neighborhood data
miami.neigh <- st_read('https://opendata.arcgis.com/datasets/2f54a0cbd67046f2bd100fb735176e6c_0.geojson')
# plot(miami.neigh)
## Miami beach neighborhood data
miami.beach <- st_read('https://opendata.arcgis.com/datasets/5ece0745e24b4617a49f2e098df8117f_0.geojson') %>%
filter(NAME == "MIAMI BEACH")
# Combine Miami beach neighborhood data and Miami city neighborhood data together
index <- miami.sf %>%
rowid_to_column(var = "rowIndex")
index.neigh <-st_join(index,miami.neigh) %>%
distinct(rowIndex, .keep_all = TRUE) %>%
select(-Shape_STAr,-Shape_STLe,-Shape__Area,-Shape__Length)
index.beach <- st_join(index,miami.beach) %>%
filter(NAME == "MIAMI BEACH") %>%
distinct(rowIndex, .keep_all = TRUE) %>%
rename(LABEL = NAME) %>%
select(-MUNICID,-MUNICUID,-CREATEDBY,-CREATEDDATE,-MODIFIEDBY,
-MODIFIEDDATE,-SHAPE_Length,-SHAPE_Area,-FIPSCODE)
### Create final data we will work on!
miami.sf <- rbind(index.neigh, index.beach) %>%
distinct(rowIndex, .keep_all = TRUE)
### check neighborhood information
miami.neigh.num <- as.data.frame(table(miami.sf$LABEL))
## Census Tract data
## View(load_variables(2017,'acs5',cache = TRUE))
tracts17 <-
get_acs(geography = "tract", variables = c("B25026_001E","B02001_002E",
"B19013_001E","B25058_001E",
"B06012_002E",
# vacant variables
"B25002_003E",
"B25004_002E","B25004_003E",
"B25004_004E","B25004_005E",
# total housing unit
"B25001_001E",
# renter occupied
'B08137_003E'),
year=2017, state= 12, county= 086, geometry=T, output="wide") %>%
st_transform('EPSG:2236') %>%
rename(TotalPop = B25026_001E,
Whites = B02001_002E,
MedHHInc = B19013_001E,
MedRent = B25058_001E,
TotalPoverty = B06012_002E,
TotalVacant = B25002_003E,
ForRent = B25004_002E,
ForRentVac = B25004_003E,
ForSale = B25004_004E,
ForSaleVac = B25004_005E,
TotalUnit = B25001_001E,
RenterOccupied = B08137_003E
) %>%
dplyr::select(-NAME, -starts_with("B")) %>% #-starts_with("B") awesome!
mutate(pctWhite = ifelse(TotalPop > 0, Whites / TotalPop,0),
pctPoverty = ifelse(TotalPop > 0, TotalPoverty / TotalPop, 0),
pctTotalVacant = ifelse(TotalUnit > 0, TotalVacant / TotalUnit * 100, 0),
TotalOccupied = TotalUnit - TotalVacant,
pctRenterOccupied = ifelse(TotalOccupied >0, RenterOccupied/TotalOccupied, 0)) %>%
dplyr::select(-Whites, -TotalPoverty)
projected.tracts17 <-
tracts17 %>%
st_transform(st_crs(miami.sf))
miami.sf <-st_join(miami.sf, projected.tracts17) %>%
distinct(rowIndex, .keep_all = TRUE)
## Amenity Data
## 1. Miami landmark
miami.landmark <- st_read('https://opendata.arcgis.com/datasets/70a14825e66f4f0eb28d2a9cceba1761_0.geojson')
miami.landmark.sf <- miami.landmark %>%
select(geometry) %>%
na.omit() %>%
distinct()
## 2. Miami Shopping Mall
miami.mall <- st_read('https://opendata.arcgis.com/datasets/cb24d578246647a9a4c57bbd80c1caa8_0.geojson')
miami.mall.sf <- miami.mall %>%
select(geometry) %>%
na.omit() %>%
distinct()
## 3. Miami Sexual Predator
miami.sexual <- st_read('https://opendata.arcgis.com/datasets/f8759d722aeb4198bfe7c4ad780604d2_0.geojson')
miami.sexual.sf <- miami.sexual %>%
select(geometry) %>%
na.omit() %>%
distinct()
## 4. School data
miami.school <- st_read('https://opendata.arcgis.com/datasets/d3db0fce650d4e40a5949b0acae6fe3a_0.geojson')
miami.school.sf <- miami.school %>%
select(geometry) %>%
na.omit() %>%
distinct()
## 5. Hotel data
miami.hotel <- st_read('https://opendata.arcgis.com/datasets/d37bbc15e7304b4ca4607783283147b7_0.geojson')
miami.hotel.sf <- miami.hotel%>%
select(geometry) %>%
na.omit() %>%
distinct()
## 6. College data
miami.college <- st_read('https://opendata.arcgis.com/datasets/7db056c406b943dc8f3f377b99d77588_0.geojson')
miami.college.sf <- miami.college%>%
select(geometry) %>%
na.omit() %>%
distinct()
## 7. Hospital data
miami.hospital <- st_read('https://opendata.arcgis.com/datasets/0067a0e8b40644f980afa23ad34c32c4_0.geojson')
miami.hospital.sf <- miami.hospital %>%
select(geometry) %>%
na.omit() %>%
distinct()
In this section, we created some new features to describe the amenities and public services in Miami-Dade County, as well as to each home sale observation’s internal characteristics.
First of all, we were aimed at creating features to measure the extent to which buyers can assess multiple public services, including hospitals, shopping malls, landmarks, hotels, motels and inns, and education resources. To utmost avoid scale bias triggered by setting arbitrary areal unit or fixed buffer distance of each home sale observation, we chose to calculate the average nearest neighbor distance from each home sale to its k nearest neighbor public services. Accordingly, the smaller the value is, the more possibility that the observation can access specific service. To better compare each feature’s impact, we set the k from 3 to 5.
By leveraging the exactly same approach, we also created a new feature to measure the extent to which buyers can expose sexual assaults.
Secondly, we added four more features to indicate the duration each house was built, effectively used, and if the house was built with a pool or a patio.
## 1. Landmark
miami.sf <-
miami.sf %>%
mutate(
landmark_nn1 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.landmark.sf)), 1),
landmark_nn2 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.landmark.sf)), 2),
landmark_nn3 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.landmark.sf)), 3),
landmark_nn4 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.landmark.sf)), 4),
landmark_nn5 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.landmark.sf)), 5)) %>%
distinct(rowIndex, .keep_all = TRUE)
## 2. Shopping mall
miami.sf <-
miami.sf %>%
mutate(
mall_nn1 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.mall.sf)), 1),
mall_nn2 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.mall.sf)), 2),
mall_nn3 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.mall.sf)), 3),
mall_nn4 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.mall.sf)), 4),
mall_nn5 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.mall.sf)), 5)) %>%
distinct(rowIndex, .keep_all = TRUE)
## 3. Sexual Assaults
miami.sf <-
miami.sf %>%
mutate(
sexual_nn1 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.sexual.sf)), 1),
sexual_nn2 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.sexual.sf)), 2),
sexual_nn3 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.sexual.sf)), 3),
sexual_nn4 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.sexual.sf)), 4),
sexual_nn5 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.sexual.sf)), 5)) %>%
distinct(rowIndex, .keep_all = TRUE)
## 4. Hotel Access
miami.sf <-
miami.sf %>%
mutate(
hotel_nn1 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.hotel.sf)), 1),
hotel_nn2 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.hotel.sf)), 2),
hotel_nn3 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.hotel.sf)), 3),
hotel_nn4 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.hotel.sf)), 4),
hotel_nn5 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.hotel.sf)), 5)) %>%
distinct(rowIndex, .keep_all = TRUE)
## 5. School Access
miami.sf <-
miami.sf %>%
mutate(
school_nn1 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.school.sf)), 1),
school_nn2 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.school.sf)), 2),
school_nn3 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.school.sf)), 3),
school_nn4 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.school.sf)), 4),
school_nn5 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.school.sf)), 5)) %>%
distinct(rowIndex, .keep_all = TRUE)
## 6. College Access
miami.sf <-
miami.sf %>%
mutate(
college_nn1 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.college.sf)), 1),
college_nn2 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.college.sf)), 2),
college_nn3 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.college.sf)), 3),
college_nn4 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.college.sf)), 4),
college_nn5 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.college.sf)), 5)) %>%
distinct(rowIndex, .keep_all = TRUE)
## 7. Hospital Access
miami.sf <-
miami.sf %>%
mutate(
hospital_nn1 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.hospital.sf)), 1),
hospital_nn2 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.hospital.sf)), 2),
hospital_nn3 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.hospital.sf)), 3),
hospital_nn4 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.hospital.sf)), 4),
hospital_nn5 = nn_function(st_coordinates(st_centroid(miami.sf)), st_coordinates(st_centroid(miami.hospital.sf)), 5)) %>%
distinct(rowIndex, .keep_all = TRUE)
## 8. Internal characteristics
miami.sf <-
miami.sf %>%
mutate(pool_house = ifelse(str_detect(XF1, "Pool"), 1, 0),
patio_house = ifelse(str_detect(XF1, "Patio"), 1, 0),
Age = 2020- YearBuilt,
Age.effective = 2020- EffectiveYearBuilt)
As indicated earlier, we separated the whole dataset (3503 observations in total) into miami.training and miami.test in advance for model building in next section. So far, we have 2,627 home sale observations for working on, while we used the left 876 observations to test model for competition.
# Split dataset
miami.training <- miami.sf %>%
filter(toPredict == '0')
# summary(miami.training)
miami.test <- miami.sf %>%
filter(toPredict == '1')
# summary(miami.test)
As required, we provided three tables of summary statistics with variable descriptions for sorting features based on their categories below.
In total, we took 62 features into consideration in the initial stage.
19 features (including SalePrice came from the category “internal characteristics”. 42 features (including census data) came from the category “amenity/public service”. We regard census data as indicators to show the house environment. For spatial features, we decided to continue to stick on neighborhood features, and summarize house number in each neighborhood for better understanding.
Detailed summary statistics are provided below.
## All potential features
all.feature.list <- miami.training %>%
dplyr:: select(-saleDate,-saleType,-saleQual,-saleYear,-Property.Address,
-Year,-WVDB,-HEX,-GPAR,-County.2nd.HEX,-County.Senior,-County.LongTermSenior,
-County.Other.Exempt,-City.2nd.HEX,-City.Senior,-City.LongTermSenior,
-City.Other.Exempt,-MillCode,-Land.Use,-Owner1,-Owner2,-Mailing.Address,
-Mailing.City,-Mailing.State,-Mailing.Zip,-Mailing.Country,
-starts_with("Legal"), -YearBuilt, -EffectiveYearBuilt, -X, -FID, -starts_with("Shape_"), -Folio,
-XF1, -XF2, -XF3) %>%
st_drop_geometry()
## 2.1 Feature: Internal Characteristics
internal.feature.list <- all.feature.list %>%
dplyr:: select(SalePrice, Land, Bldg, Total, Assessed,County.Taxable,City.Taxable,AdjustedSqFt,
LotSize, Bed, Bath, Stories, Units, LivingSqFt, ActualSqFt, Age,Age.effective,
pool_house, patio_house)
stargazer(internal.feature.list, type = "html",
title = "Table DATA 2.1 Summary Statistics of Internal Characteristics ",
header = FALSE,
single.row = TRUE)
| Statistic | N | Mean | St. Dev. | Min | Pctl(25) | Pctl(75) | Max |
| SalePrice | 2,627 | 974,540.600 | 2,047,999.000 | 12,500 | 299,000 | 826,250 | 27,750,000 |
| Land | 2,627 | 485,089.200 | 985,245.200 | 27,050 | 134,957 | 457,257.5 | 20,972,640 |
| Bldg | 2,627 | 289,575.800 | 838,520.000 | 1,000 | 79,941 | 203,242 | 15,175,769 |
| Total | 2,627 | 774,665.000 | 1,627,564.000 | 55,159 | 233,916 | 687,787 | 21,974,519 |
| Assessed | 2,627 | 737,515.200 | 1,600,859.000 | 29,137 | 217,670.5 | 628,332.5 | 21,974,519 |
| County.Taxable | 2,627 | 713,294.700 | 1,603,350.000 | 0 | 192,542.5 | 607,352 | 21,974,519 |
| City.Taxable | 2,627 | 713,294.700 | 1,603,350.000 | 0 | 192,542.5 | 607,352 | 21,974,519 |
| AdjustedSqFt | 2,627 | 2,044.029 | 1,492.860 | 331 | 1,248 | 2,251 | 18,006 |
| LotSize | 2,627 | 7,681.807 | 4,748.604 | 1,250 | 5,400 | 7,931.2 | 80,664 |
| Bed | 2,627 | 2.992 | 1.100 | 0 | 2 | 3 | 13 |
| Bath | 2,627 | 2.054 | 1.316 | 0 | 1 | 2 | 12 |
| Stories | 2,627 | 1.192 | 0.432 | 0 | 1 | 1 | 4 |
| Units | 2,627 | 1.006 | 0.078 | 1 | 1 | 1 | 2 |
| LivingSqFt | 2,627 | 1,971.459 | 1,429.946 | 288 | 1,194.5 | 2,215 | 18,006 |
| ActualSqFt | 2,627 | 2,324.683 | 1,809.884 | 388 | 1,383 | 2,544.5 | 20,192 |
| Age | 2,627 | 68.772 | 22.720 | 1 | 67 | 82 | 115 |
| Age.effective | 2,627 | 47.199 | 26.669 | 1 | 24 | 71 | 115 |
| pool_house | 2,627 | 0.255 | 0.436 | 0 | 0 | 1 | 1 |
| patio_house | 2,627 | 0.178 | 0.383 | 0 | 0 | 0 | 1 |
## 2.2 Feature: Amenities/Public Services
amenity.feature.list <- all.feature.list %>%
dplyr:: select(starts_with('landmark_'),starts_with('mall_'),starts_with('hospital_'),
starts_with('college_'),starts_with('school_'),starts_with('hotel_'),
pctTotalVacant, pctRenterOccupied, pctWhite,pctPoverty, MedRent ,MedHHInc)
stargazer(amenity.feature.list, type = "html",
title = "Table DATA 2.2 Summary Statistics of Amenities/Public Services ",
header = FALSE,
single.row = TRUE)
| Statistic | N | Mean | St. Dev. | Min | Pctl(25) | Pctl(75) | Max |
| landmark_nn1 | 2,627 | 0.015 | 0.009 | 0.001 | 0.009 | 0.019 | 0.052 |
| landmark_nn2 | 2,627 | 0.019 | 0.011 | 0.002 | 0.012 | 0.023 | 0.053 |
| landmark_nn3 | 2,627 | 0.022 | 0.012 | 0.003 | 0.014 | 0.026 | 0.058 |
| landmark_nn4 | 2,627 | 0.024 | 0.012 | 0.004 | 0.016 | 0.027 | 0.062 |
| landmark_nn5 | 2,627 | 0.025 | 0.012 | 0.006 | 0.017 | 0.029 | 0.065 |
| mall_nn1 | 2,627 | 0.023 | 0.010 | 0.001 | 0.015 | 0.031 | 0.051 |
| mall_nn2 | 2,627 | 0.031 | 0.012 | 0.009 | 0.022 | 0.039 | 0.058 |
| mall_nn3 | 2,627 | 0.037 | 0.012 | 0.012 | 0.029 | 0.046 | 0.065 |
| mall_nn4 | 2,627 | 0.043 | 0.012 | 0.020 | 0.033 | 0.052 | 0.071 |
| mall_nn5 | 2,627 | 0.047 | 0.012 | 0.028 | 0.036 | 0.057 | 0.077 |
| hospital_nn1 | 2,627 | 0.019 | 0.010 | 0.001 | 0.012 | 0.024 | 0.056 |
| hospital_nn2 | 2,627 | 0.025 | 0.011 | 0.002 | 0.018 | 0.030 | 0.069 |
| hospital_nn3 | 2,627 | 0.029 | 0.013 | 0.005 | 0.020 | 0.034 | 0.076 |
| hospital_nn4 | 2,627 | 0.033 | 0.014 | 0.005 | 0.023 | 0.037 | 0.083 |
| hospital_nn5 | 2,627 | 0.036 | 0.015 | 0.006 | 0.026 | 0.038 | 0.089 |
| college_nn1 | 2,627 | 0.013 | 0.008 | 0.0004 | 0.007 | 0.016 | 0.053 |
| college_nn2 | 2,627 | 0.016 | 0.009 | 0.002 | 0.010 | 0.019 | 0.053 |
| college_nn3 | 2,627 | 0.018 | 0.009 | 0.002 | 0.013 | 0.022 | 0.054 |
| college_nn4 | 2,627 | 0.020 | 0.009 | 0.002 | 0.015 | 0.024 | 0.056 |
| college_nn5 | 2,627 | 0.022 | 0.010 | 0.002 | 0.016 | 0.025 | 0.058 |
| school_nn1 | 2,627 | 0.007 | 0.004 | 0.0004 | 0.004 | 0.009 | 0.027 |
| school_nn2 | 2,627 | 0.009 | 0.004 | 0.001 | 0.006 | 0.011 | 0.028 |
| school_nn3 | 2,627 | 0.010 | 0.005 | 0.002 | 0.007 | 0.013 | 0.028 |
| school_nn4 | 2,627 | 0.012 | 0.005 | 0.003 | 0.008 | 0.014 | 0.028 |
| school_nn5 | 2,627 | 0.013 | 0.006 | 0.004 | 0.009 | 0.015 | 0.032 |
| hotel_nn1 | 2,627 | 0.010 | 0.006 | 0.0001 | 0.005 | 0.013 | 0.030 |
| hotel_nn2 | 2,627 | 0.011 | 0.007 | 0.0004 | 0.006 | 0.014 | 0.030 |
| hotel_nn3 | 2,627 | 0.012 | 0.007 | 0.001 | 0.007 | 0.015 | 0.031 |
| hotel_nn4 | 2,627 | 0.013 | 0.007 | 0.001 | 0.007 | 0.016 | 0.032 |
| hotel_nn5 | 2,627 | 0.013 | 0.007 | 0.001 | 0.008 | 0.017 | 0.033 |
| pctTotalVacant | 2,627 | 15.068 | 9.325 | 2.237 | 8.509 | 19.776 | 48.308 |
| pctRenterOccupied | 2,627 | 0.657 | 0.271 | 0.142 | 0.460 | 0.773 | 1.480 |
| pctWhite | 2,627 | 0.735 | 0.298 | 0.058 | 0.611 | 0.957 | 1.172 |
| pctPoverty | 2,627 | 0.216 | 0.111 | 0.066 | 0.123 | 0.304 | 0.582 |
| MedRent | 2,540 | 1,099.507 | 402.345 | 245.000 | 836.000 | 1,185.000 | 2,271.000 |
| MedHHInc | 2,627 | 56,581.220 | 43,518.950 | 14,699 | 30,099 | 60,057 | 172,750 |
## 2.3 Spatial Structure: Neighborhood
spatial.feature.list <- all.feature.list %>%
dplyr:: select(LABEL)
as.data.frame(table(spatial.feature.list)) %>%
rename(`Neighborhood List` = spatial.feature.list,
Number =Freq) %>%
kable(caption = 'Table DATA 2.3 Spatial Feature: Neighborhood') %>%
kable_styling("striped", full_width = F)
| Neighborhood List | Number |
|---|---|
| 79th Street | 1 |
| Allapattah Industrial District | 4 |
| Auburndale | 47 |
| Bay Heights | 13 |
| Baypoint | 23 |
| Bayside | 33 |
| Belle Island | 5 |
| Belle Meade | 48 |
| Belle Meade West | 16 |
| Bird Grove East | 10 |
| Bird Grove West | 11 |
| Biscayne Island | 2 |
| Biscayne Plaza | 2 |
| Brentwood | 3 |
| Buena Vista Heights | 19 |
| Buena Vista West | 48 |
| Citrus Grove | 49 |
| Civic Center | 3 |
| Coral Gate | 26 |
| Curtis Park | 13 |
| Douglas Park | 39 |
| East Grove | 53 |
| East Little Havana | 17 |
| Edgewater | 3 |
| Edison | 65 |
| Fair Isle | 15 |
| Flagami | 330 |
| Flora Park | 49 |
| Grove Center | 7 |
| Hadley Park | 128 |
| Haynesworth | 1 |
| Highland Park | 4 |
| Historic Buena Vista East | 10 |
| King Heights | 29 |
| La Pastorita | 7 |
| Latin Quarter | 4 |
| Le Jeune Gardens | 1 |
| Lemon City/Little Haiti | 24 |
| Liberty Square | 40 |
| Little River Central | 14 |
| Little River Gardens | 2 |
| Melrose | 19 |
| Miami Avenue | 15 |
| Morningside | 49 |
| North Grapeland Heights | 43 |
| North Grove | 43 |
| North Sewell Park | 12 |
| Northeast Overtown | 4 |
| Northwestern Estates | 32 |
| Oakland Grove | 1 |
| Old San Juan | 22 |
| Orange Bowl | 3 |
| Orchard Villa | 28 |
| Palm Grove | 4 |
| Parkdale North | 16 |
| Parkdale South | 17 |
| Roads | 88 |
| San Marco Island | 3 |
| Santa Clara | 44 |
| Shenandoah North | 95 |
| Shenandoah South | 93 |
| Shorecrest | 64 |
| Silver Bluff | 93 |
| South Grapeland Heights | 21 |
| South Grove | 128 |
| South Grove Bayside | 27 |
| South Sewell Park | 19 |
| Spring Garden | 3 |
| Vizcaya | 1 |
| West Grapeland Heights | 13 |
| West Grove | 32 |
# DATA - 3 Correlation Matrix
miami.numericVars.internal <-
select_if(internal.feature.list, is.numeric) %>%
na.omit()
amenity.feature.list.with.price <- all.feature.list %>%
dplyr:: select(SalePrice, starts_with('landmark_'),starts_with('mall_'),starts_with('hospital_'),
starts_with('college_'),starts_with('school_'),starts_with('hotel_'),
pctTotalVacant, pctRenterOccupied, pctWhite,pctPoverty, MedRent ,MedHHInc)
miami.numericVars.amenity <-
select_if(amenity.feature.list.with.price, is.numeric) %>%
na.omit()
ggcorrplot(
round(cor(miami.numericVars.internal), 1),
p.mat = cor_pmat(miami.numericVars.internal),
colors = c('#05A167', "white", '#6897BB'),
lab_size = 1,
tl.cex = 8,
type="lower",
insig = c("pch", "blank"), pch = 1, pch.col = "black", pch.cex =1) +
labs(title = "Correlation across numeric variables\n(internal features)\n",
caption = 'Figure DATA 3.1') +
plotTheme()
ggcorrplot(
round(cor(miami.numericVars.amenity), 1),
p.mat = cor_pmat(miami.numericVars.amenity),
colors = c('#05A167', "white", '#6897BB'),
lab_size = 1,
tl.cex = 5,
type="lower",
insig = c("pch", "blank"), pch = 1, pch.col = "black", pch.cex =1) +
labs(title = "Correlation across numeric variables\n(amenity/public service features)\n",
caption = 'Figure DATA 3.2') +
plotTheme()
In this section, we specifically explored the correlation between sale price and four interesting and meaningful features: pctTotalVacant, AdjustSqFt, school_nn4 (average distance of four nearest public schools) and hospital_nn1 (the accessibility to nearest hospital).
Through graphs below, it is obvious that the first three of four features are positively correlated to sale price, while the distance to nearest hospital is negatively correlated to the sale price that is consistent with our intuition.
# DATA - 4 Scatter plot
## 1. Two House Internal characteristics
features <- c('Adjusted Square Feet', 'Percentage of Vacant Property')
names(features) <- c("AdjustedSqFt", "pctTotalVacant")
price.house.plot <-
st_drop_geometry(miami.training) %>%
dplyr::select(SalePrice, pctTotalVacant, AdjustedSqFt) %>%
filter(SalePrice <= 1000000) %>%
gather(Variable, Value, -SalePrice) %>%
ggplot(aes(Value, SalePrice)) +
geom_point(size = .5) +
geom_smooth(method = "lm", se=F, colour = "darkred") +
facet_wrap(~Variable, ncol = 2, scales = "free",
labeller = labeller(Variable = features)) +
plotTheme() +
labs(title = "Price as a Function of House Internal Characteristics Variables:\nVacant Property, Adjusted Housing Area\n",
caption = 'Figure DATA 4.1')
price.house.plot
## 2. Average Distance of Nearest Four Public Schools Correlation
price.school_nn4.plot <-
st_drop_geometry(miami.training) %>%
dplyr::select(SalePrice, school_nn4) %>%
filter(SalePrice <= 1000000) %>%
gather(Variable, Value, -SalePrice) %>%
ggplot(aes(Value, SalePrice)) +
geom_point(size = .5) +
geom_smooth(method = "lm", se=F, colour = "darkred") +
labs(title = "Price as a Function of Average Distance of\nNearest Four Public Schools Correlation\n",
caption = 'Figure DATA 4.2') +
plotTheme()
price.school_nn4.plot
## 3. Hospital Correlation
price.hospital_nn1.plot <-
st_drop_geometry(miami.training) %>%
dplyr::select(SalePrice, hospital_nn1) %>%
filter(SalePrice <= 1000000) %>%
gather(Variable, Value, -SalePrice) %>%
ggplot(aes(Value, SalePrice)) +
geom_point(size = .5) +
geom_smooth(method = "lm", se=F, colour = "darkred") +
labs(title = "Price as a Function of Average Distance of\nNearest Four Hospitals Correlation\n",
caption = 'Figure DATA 4.3') +
plotTheme()
price.hospital_nn1.plot
# DATA - 5 Map of Sale Price per Square Feet
miami.training <- miami.training %>%
mutate(priceFt = SalePrice/LivingSqFt)
map.priceFt <- ggplot() +
geom_sf(data = st_union(miami.base),fill = 'grey') +
geom_sf(data = st_centroid(miami.training),aes(color = q5(priceFt)),size = .5) +
scale_color_manual(values = palette,
labels = qBr(miami.training,'priceFt'),
name = "Price/ft^2") +
labs(title = 'Sale Price Per Square Foot\n',
caption = 'Figure DATA 5') +
mapTheme() +
plotTheme()
map.priceFt
In this section, we were aimed at mapping the vacant units proportion by each census tract where certain home sale observation is located at, the adjusted area of each home sale observation and the distribution of average distance of each home sale observation to the nearest hospital.
# DATA - 6
## 1. Vacant units
map.pctV <- ggplot() +
geom_sf(data = st_union(miami.base),fill = 'grey') +
geom_sf(data = st_centroid(miami.training),aes(color = q5(pctTotalVacant)),size = .5) +
scale_color_manual(values = palette,
labels = qBr(miami.training,'pctTotalVacant'),
name = "Percentage of Vacant Units") +
labs(title = 'Percentage of Vacant Units',
subtitle = 'Note: dots indicating the vacant unts proportion in certain census tract\nwhere certain home sale observation is located at.\n',
caption = 'Figure DATA 6.1') +
mapTheme() +
plotTheme()
map.pctV
## 2. Adjusted Square Feet
map.Adjust.SqFt <- ggplot() +
geom_sf(data = st_union(miami.base),fill = 'grey') +
geom_sf(data = st_centroid(miami.training),aes(color = q5(AdjustedSqFt)),size = .5) +
scale_color_manual(values = palette,
labels = qBr(miami.training,'AdjustedSqFt'),
name = "Adjusted Square Feet") +
labs(title = 'Distribution of Adjusted Square Feet\n',
caption = 'Figure DATA 6.2') +
mapTheme() +
plotTheme()
map.Adjust.SqFt
## 3. Hospital
hospital_nn1_expanded <- miami.training %>%
mutate(hospital_nn1.expanded = hospital_nn1 * 1000)
map.hospital <- ggplot() +
geom_sf(data = st_union(miami.base),fill = 'grey') +
geom_sf(data = st_centroid(hospital_nn1_expanded),aes(color = q5(hospital_nn1.expanded)),size = .5) +
scale_color_manual(values = palette,
labels = qBr(hospital_nn1_expanded,'hospital_nn1.expanded'),
name = "Average Distrance to Nearest Hospital\n(timed by 1000 for better visualization)\n") +
labs(title = 'Distribution of Average Distrance to Nearest Hospital\nof Each Home Sale Observation\n',
caption = 'Figure DATA 6.3') +
mapTheme() +
plotTheme()
map.hospital
The mean objective of this analysis is to train a robust model to predict house price as accurately as possible. Accordingly, this section started to provide a detailed interpretation on the process of the model building.
Data and Sample In order to accurately build and test models, we randomly split the whole miami.training into two parts: model.miami.training with 60% of observations for model training and model.miami.test with 40% of observations for model testing.
Dependent Variable The dependent variable is home sale price.
Independent Variables Based on the hedonic home price model, potential features should encompass at least three components: internal characteristics, like the number of bathrooms; neighborhood amenities/public services, like hospital accessibility; and the underlying spatial process of prices. After creating features and engineering and checking correlation coefficients, 54 features are eligible for the model and obviously or slightly correlated with SalePrice that were the initial independent variables for model building. We gradually screened out variables and eventually selected only 16 robust features included in the final model
Procedure The primitive approach used in this analysis for model building is Ordinary Least Squares Regression (OLS) which is aimed at predicting the SalePrice of houses as a function of several statistical parameters such as intercept, slope and selected features. In order to determine which features were able to significantly increase the accuracy and generalizability of the model, we systematically compared, plotted, and mapping the mean absolute errors (MAE), mean absolute percent errors (MAPE), as well as root mean square errors (RMSE, the standard deviation of the residuals) between original SalePrice and predicted SalePrice of both training set and test set, conducted the cross-validation on 100 folds, explored the underlying spatial pattern of price and errors caused by problematic prediction. To address the problem of neglecting the spatial pattern, we then compared, plotted, and mapped the mean absolute errors (MAE), and mean absolute percent errors (MAPE) between original SalePrice and predicted SalePrice of test set by neighborhood.
# 1. Split dataset
set.seed(31357)
# get index for training sample
inTrain <- caret::createDataPartition(
y = paste(miami.training$LABEL,miami.training$SalePrice ),
p = .60, list = FALSE)
# split data into training and test
model.miami.training <- miami.training[inTrain,]
model.miami.test <- miami.training[-inTrain,]
## First model: All internal features, amenity features
M1 <- lm(SalePrice ~ ., data = st_drop_geometry(model.miami.training) %>%
dplyr::select(colnames(internal.feature.list),
colnames(amenity.feature.list)
))
stargazer(M1, type = "html",
title = "Summary Statistics of Model 1 ",
header = FALSE,
single.row = TRUE)
| Dependent variable: | |
| SalePrice | |
| Land | 0.991*** (0.021) |
| Bldg | 1.093*** (0.023) |
| Total | |
| Assessed | 0.517*** (0.112) |
| County.Taxable | -0.303*** (0.108) |
| City.Taxable | |
| AdjustedSqFt | -159.901*** (37.232) |
| LotSize | -13.966*** (1.849) |
| Bed | -11,249.950 (6,997.370) |
| Bath | 17,335.080** (7,506.400) |
| Stories | -10,399.910 (13,662.860) |
| Units | -53,679.120 (55,759.680) |
| LivingSqFt | 33.200 (23.408) |
| ActualSqFt | 134.091*** (19.856) |
| Age | 252.681 (243.817) |
| Age.effective | -63.247 (188.533) |
| pool_house | 14,246.280 (13,530.970) |
| patio_house | -4,738.318 (11,416.990) |
| landmark_nn1 | -2,792,163.000 (2,351,722.000) |
| landmark_nn2 | 16,876,972.000** (7,583,331.000) |
| landmark_nn3 | -31,296,437.000*** (12,121,768.000) |
| landmark_nn4 | 23,404,861.000 (19,773,533.000) |
| landmark_nn5 | -6,786,591.000 (12,738,600.000) |
| mall_nn1 | 176,783.400 (1,845,897.000) |
| mall_nn2 | -4,287,708.000 (4,432,823.000) |
| mall_nn3 | 5,844,359.000 (5,697,162.000) |
| mall_nn4 | 496,030.000 (8,900,686.000) |
| mall_nn5 | -2,054,266.000 (5,713,360.000) |
| hospital_nn1 | -1,818,032.000 (1,888,218.000) |
| hospital_nn2 | 4,421,767.000 (4,345,964.000) |
| hospital_nn3 | -4,954,592.000 (8,681,228.000) |
| hospital_nn4 | -1,725,413.000 (13,318,517.000) |
| hospital_nn5 | 3,390,553.000 (8,335,588.000) |
| college_nn1 | -4,720,098.000* (2,629,573.000) |
| college_nn2 | 5,923,026.000 (5,873,775.000) |
| college_nn3 | -7,233,703.000 (12,571,717.000) |
| college_nn4 | 4,070,657.000 (17,754,491.000) |
| college_nn5 | 669,506.300 (10,417,329.000) |
| school_nn1 | -687,049.800 (3,421,098.000) |
| school_nn2 | 3,194,391.000 (6,890,515.000) |
| school_nn3 | -23,921,011.000** (11,137,220.000) |
| school_nn4 | 40,568,356.000** (17,839,289.000) |
| school_nn5 | -18,838,280.000 (12,109,364.000) |
| hotel_nn1 | -1,220,964.000 (4,501,937.000) |
| hotel_nn2 | 9,678,923.000 (12,125,473.000) |
| hotel_nn3 | -2,871,359.000 (21,318,716.000) |
| hotel_nn4 | -27,767,343.000 (31,010,023.000) |
| hotel_nn5 | 21,457,163.000 (18,334,682.000) |
| pctTotalVacant | 963.610 (863.743) |
| pctRenterOccupied | 6,812.634 (24,174.830) |
| pctWhite | 19,574.130 (38,267.100) |
| pctPoverty | 26,776.040 (81,935.820) |
| MedRent | -61.026** (27.140) |
| MedHHInc | 0.592* (0.311) |
| Constant | 118,214.900 (91,423.930) |
| Observations | 2,378 |
| R2 | 0.992 |
| Adjusted R2 | 0.991 |
| Residual Std. Error | 196,411.800 (df = 2325) |
| F Statistic | 5,280.829*** (df = 52; 2325) |
| Note: | p<0.1; p<0.05; p<0.01 |
## Second model:
M2 <- lm(SalePrice ~ ., data = st_drop_geometry(model.miami.training) %>%
dplyr::select(# internal features
SalePrice, Land, Bldg, Assessed, County.Taxable,
AdjustedSqFt, LotSize, Bed,Bath, ActualSqFt,
# amenity features
landmark_nn2, landmark_nn3,
hospital_nn1, hospital_nn3, hospital_nn4,
college_nn1, school_nn3, school_nn4, school_nn5,
pctTotalVacant, pctRenterOccupied, MedRent, MedHHInc)
)
stargazer(M2, type = "html",
title = "Summary Statistics of Model 2 ",
header = FALSE,
single.row = TRUE)
| Dependent variable: | |
| SalePrice | |
| Land | 0.985*** (0.021) |
| Bldg | 1.082*** (0.022) |
| Assessed | 0.507*** (0.110) |
| County.Taxable | -0.288*** (0.106) |
| AdjustedSqFt | -118.404*** (23.316) |
| LotSize | -13.279*** (1.726) |
| Bed | -13,051.200* (6,836.993) |
| Bath | 17,439.450** (7,183.896) |
| ActualSqFt | 127.631*** (18.325) |
| landmark_nn2 | 1,799,788.000 (3,559,741.000) |
| landmark_nn3 | -2,635,938.000 (3,763,173.000) |
| hospital_nn1 | -2,196,553.000** (933,375.700) |
| hospital_nn3 | 3,016,274.000 (3,704,837.000) |
| hospital_nn4 | -1,101,879.000 (3,527,266.000) |
| college_nn1 | -1,973,853.000*** (731,082.300) |
| school_nn3 | -28,006,369.000*** (7,261,776.000) |
| school_nn4 | 45,941,758.000*** (15,793,927.000) |
| school_nn5 | -17,237,124.000* (9,831,561.000) |
| pctTotalVacant | 669.337 (634.989) |
| pctRenterOccupied | 14,496.700 (21,090.990) |
| MedRent | -46.688** (20.841) |
| MedHHInc | 0.541** (0.244) |
| Constant | 74,872.530*** (28,563.910) |
| Observations | 2,378 |
| R2 | 0.992 |
| Adjusted R2 | 0.991 |
| Residual Std. Error | 196,282.500 (df = 2355) |
| F Statistic | 12,497.180*** (df = 22; 2355) |
| Note: | p<0.1; p<0.05; p<0.01 |
# Third Model
M3 <- lm(SalePrice ~ ., data = st_drop_geometry(model.miami.training) %>%
dplyr::select(# internal features
SalePrice, Land, Bldg, Assessed, County.Taxable,
AdjustedSqFt, LotSize, Bed,Bath, ActualSqFt,
# amenity features
college_nn1,
school_nn3, school_nn4, school_nn5,
pctTotalVacant, MedRent, MedHHInc)
)
stargazer(M3, type = "html",
title = "Summary Statistics of Model 3 ",
header = FALSE,
single.row = TRUE)
| Dependent variable: | |
| SalePrice | |
| Land | 0.985*** (0.021) |
| Bldg | 1.081*** (0.022) |
| Assessed | 0.499*** (0.110) |
| County.Taxable | -0.281*** (0.106) |
| AdjustedSqFt | -117.790*** (23.306) |
| LotSize | -13.032*** (1.713) |
| Bed | -12,423.740* (6,833.118) |
| Bath | 18,002.580** (7,169.642) |
| ActualSqFt | 126.495*** (18.312) |
| college_nn1 | -2,236,855.000*** (707,102.700) |
| school_nn3 | -28,465,266.000*** (6,919,330.000) |
| school_nn4 | 44,475,526.000*** (15,632,117.000) |
| school_nn5 | -15,908,357.000* (9,572,297.000) |
| pctTotalVacant | 1,037.946* (585.099) |
| MedRent | -38.722** (18.400) |
| MedHHInc | 0.479** (0.200) |
| Constant | 67,019.810*** (22,540.550) |
| Observations | 2,378 |
| R2 | 0.991 |
| Adjusted R2 | 0.991 |
| Residual Std. Error | 196,340.900 (df = 2361) |
| F Statistic | 17,172.940*** (df = 16; 2361) |
| Note: | p<0.1; p<0.05; p<0.01 |
## Below is our baseline regression model
m2.training <- M3
# 2. Table of model summary on my training set
stargazer(m2.training, type = "html",
title = "Summary Statistics of Baseline Model on Training Set",
header = FALSE,
single.row = TRUE)
| Dependent variable: | |
| SalePrice | |
| Land | 0.985*** (0.021) |
| Bldg | 1.081*** (0.022) |
| Assessed | 0.499*** (0.110) |
| County.Taxable | -0.281*** (0.106) |
| AdjustedSqFt | -117.790*** (23.306) |
| LotSize | -13.032*** (1.713) |
| Bed | -12,423.740* (6,833.118) |
| Bath | 18,002.580** (7,169.642) |
| ActualSqFt | 126.495*** (18.312) |
| college_nn1 | -2,236,855.000*** (707,102.700) |
| school_nn3 | -28,465,266.000*** (6,919,330.000) |
| school_nn4 | 44,475,526.000*** (15,632,117.000) |
| school_nn5 | -15,908,357.000* (9,572,297.000) |
| pctTotalVacant | 1,037.946* (585.099) |
| MedRent | -38.722** (18.400) |
| MedHHInc | 0.479** (0.200) |
| Constant | 67,019.810*** (22,540.550) |
| Observations | 2,378 |
| R2 | 0.991 |
| Adjusted R2 | 0.991 |
| Residual Std. Error | 196,340.900 (df = 2361) |
| F Statistic | 17,172.940*** (df = 16; 2361) |
| Note: | p<0.1; p<0.05; p<0.01 |
broom::glance(m2.training) %>%
kable(caption = 'Summary of Model Evaluation Parameters') %>%
kable_styling("striped", full_width = F)
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.9914805 | 0.9914227 | 196340.9 | 17172.93 | 0 | 16 | -32347.84 | 64731.67 | 64835.61 | 9.101599e+13 | 2361 | 2378 |
# 3. MAE, MAPE on my test set
m2.miami.test <-
model.miami.test %>%
mutate(Regression = "Baseline Regression",
SalePrice.Predict = predict(m2.training, newdata = model.miami.test),
SalePrice.Error = SalePrice - SalePrice.Predict,
SalePrice.AbsError = abs(SalePrice - SalePrice.Predict),
SalePrice.APE = (abs(SalePrice - SalePrice.Predict)) / SalePrice) %>%
filter(SalePrice < 5000000)
test_GoodnessFit<-as.data.frame(cbind(mean(m2.miami.test$SalePrice.AbsError,na.rm = T),
mean(m2.miami.test$SalePrice.APE,na.rm = T)))
colnames(test_GoodnessFit)<-c("Mean Absolute Errors (MAE)","MAPE")
kable(test_GoodnessFit,
caption = 'Table RESULT 3. MAE and MAPE for Test Set') %>%
kable_styling("striped", full_width = F)
| Mean Absolute Errors (MAE) | MAPE |
|---|---|
| 59645.31 | 0.1236198 |
# 4. Cross-Validation Test on Model 2
## 4.1 use caret package cross-validation method
fitControl <- caret:: trainControl(method = "cv",
number = 100,
savePredictions = TRUE)
set.seed(717)
m2.cv <-
caret::train(SalePrice ~ ., data = st_drop_geometry(miami.training) %>%
dplyr::select(# internal features
SalePrice, Land, Bldg, Assessed, County.Taxable,
AdjustedSqFt, LotSize, Bed,Bath, ActualSqFt,
# amenity features
college_nn1,
school_nn3, school_nn4, school_nn5,
pctTotalVacant, MedRent, MedHHInc) ,
method = "lm",
trControl = fitControl,
na.action = na.pass)
## 4.2 show RMSE, MAE, R^2
kable(m2.cv$resample,
caption = 'Table RESULT 4. Cross-validation Test: Summary of RMSE, R Squared and MAE') %>%
kable_styling("striped", full_width = F)
| RMSE | Rsquared | MAE | Resample |
|---|---|---|---|
| 198139.61 | 0.9853120 | 119795.03 | Fold001 |
| 60086.39 | 0.9960435 | 48432.08 | Fold002 |
| 122823.07 | 0.9900892 | 76756.16 | Fold003 |
| 207097.01 | 0.9704187 | 96194.43 | Fold004 |
| 162566.98 | 0.9895799 | 98348.59 | Fold005 |
| 199639.82 | 0.9986516 | 95787.21 | Fold006 |
| 731604.37 | 0.9949180 | 308338.47 | Fold007 |
| 127088.59 | 0.9976789 | 72499.21 | Fold008 |
| 196071.62 | 0.9895360 | 97195.16 | Fold009 |
| 239853.33 | 0.9599551 | 110998.29 | Fold010 |
| 186646.45 | 0.9458751 | 104977.37 | Fold011 |
| 82553.65 | 0.9910639 | 60294.86 | Fold012 |
| 77518.37 | 0.9974416 | 58943.17 | Fold013 |
| 412016.70 | 0.9864821 | 150361.53 | Fold014 |
| 205209.63 | 0.9710427 | 104502.83 | Fold015 |
| 90037.17 | 0.9876723 | 61543.14 | Fold016 |
| 138328.50 | 0.9949880 | 84990.71 | Fold017 |
| 84309.07 | 0.9945610 | 60266.43 | Fold018 |
| 104292.57 | 0.9485969 | 65590.16 | Fold019 |
| 139049.40 | 0.9931974 | 84661.85 | Fold020 |
| 213424.92 | 0.8892981 | 98464.62 | Fold021 |
| 97822.97 | 0.9947580 | 64024.16 | Fold022 |
| 129640.96 | 0.9617291 | 79505.35 | Fold023 |
| 133704.88 | 0.9962616 | 70747.15 | Fold024 |
| 178009.60 | 0.9994841 | 115080.18 | Fold025 |
| 141117.31 | 0.9957801 | 87553.03 | Fold026 |
| 596292.62 | 0.9927644 | 182950.18 | Fold027 |
| 371061.62 | 0.8329185 | 136399.18 | Fold028 |
| 154420.55 | 0.9829284 | 84976.60 | Fold029 |
| 94905.69 | 0.9803618 | 59535.60 | Fold030 |
| 92959.86 | 0.9825883 | 68315.44 | Fold031 |
| 110161.30 | 0.9994238 | 73494.07 | Fold032 |
| 187838.59 | 0.9973274 | 111943.09 | Fold033 |
| 194452.12 | 0.9974584 | 102532.76 | Fold034 |
| 137632.83 | 0.9875680 | 98998.44 | Fold035 |
| 86774.46 | 0.9632703 | 70697.19 | Fold036 |
| 139760.85 | 0.9947746 | 81720.61 | Fold037 |
| 103178.62 | 0.9834567 | 76393.19 | Fold038 |
| 74894.92 | 0.9997241 | 50002.23 | Fold039 |
| 151937.04 | 0.9637194 | 83080.49 | Fold040 |
| 105624.15 | 0.9840339 | 82284.21 | Fold041 |
| 362108.14 | 0.9456223 | 155438.69 | Fold042 |
| 136263.54 | 0.9755882 | 98364.83 | Fold043 |
| 166337.59 | 0.9475920 | 112916.77 | Fold044 |
| 105441.35 | 0.9991632 | 76498.76 | Fold045 |
| 144510.48 | 0.9942572 | 91785.92 | Fold046 |
| 105413.46 | 0.9966984 | 70204.28 | Fold047 |
| 300114.49 | 0.9960530 | 134121.91 | Fold048 |
| 171303.30 | 0.9976148 | 85137.17 | Fold049 |
| 107595.16 | 0.9365308 | 81378.68 | Fold050 |
| 122186.91 | 0.9986492 | 81565.15 | Fold051 |
| 169460.14 | 0.9375060 | 108010.98 | Fold052 |
| 215624.41 | 0.9653245 | 125246.01 | Fold053 |
| 123012.37 | 0.9997520 | 79336.06 | Fold054 |
| 80035.26 | 0.9946228 | 57943.16 | Fold055 |
| 162030.20 | 0.9565660 | 91569.13 | Fold056 |
| 141797.61 | 0.9892701 | 90537.01 | Fold057 |
| 242441.20 | 0.9926873 | 93936.52 | Fold058 |
| 94436.06 | 0.9777007 | 58001.11 | Fold059 |
| 270521.03 | 0.9977142 | 107391.70 | Fold060 |
| 85328.28 | 0.9926478 | 64009.44 | Fold061 |
| 112068.01 | 0.9963280 | 67084.45 | Fold062 |
| 70130.07 | 0.9647300 | 50401.73 | Fold063 |
| 90925.92 | 0.9983749 | 64357.41 | Fold064 |
| 110789.51 | 0.9687090 | 64734.13 | Fold065 |
| 80713.56 | 0.9877941 | 62576.56 | Fold066 |
| 250286.38 | 0.9825317 | 123288.27 | Fold067 |
| 72278.11 | 0.9994246 | 57037.43 | Fold068 |
| 225347.54 | 0.9413927 | 117897.92 | Fold069 |
| 168131.35 | 0.9934126 | 80147.15 | Fold070 |
| 152362.98 | 0.9412925 | 81758.69 | Fold071 |
| 367494.49 | 0.9844215 | 128898.98 | Fold072 |
| 91232.84 | 0.9859311 | 57671.03 | Fold073 |
| 86841.99 | 0.9902169 | 60353.74 | Fold074 |
| 101498.28 | 0.9646062 | 74243.32 | Fold075 |
| 217979.38 | 0.9986084 | 132059.32 | Fold076 |
| 120466.03 | 0.9970362 | 80202.54 | Fold077 |
| 158587.14 | 0.9678984 | 89297.14 | Fold078 |
| 121976.12 | 0.9976592 | 83994.63 | Fold079 |
| 175696.23 | 0.9867851 | 102838.63 | Fold080 |
| 109927.21 | 0.9970085 | 71705.27 | Fold081 |
| 210105.20 | 0.9416603 | 119954.07 | Fold082 |
| 313337.30 | 0.9949994 | 110378.88 | Fold083 |
| 331910.00 | 0.9987171 | 157354.12 | Fold084 |
| 367224.18 | 0.9917536 | 173486.56 | Fold085 |
| 112781.76 | 0.9995143 | 82741.16 | Fold086 |
| 141576.60 | 0.9709638 | 86515.32 | Fold087 |
| 127408.69 | 0.9980478 | 80797.91 | Fold088 |
| 160226.58 | 0.9642931 | 100449.11 | Fold089 |
| 205442.75 | 0.9604512 | 113785.68 | Fold090 |
| 75554.76 | 0.9893433 | 61585.62 | Fold091 |
| 113371.01 | 0.9836034 | 72270.88 | Fold092 |
| 233922.94 | 0.9916611 | 115189.21 | Fold093 |
| 147770.83 | 0.9955132 | 85004.00 | Fold094 |
| 270069.39 | 0.9994397 | 129313.17 | Fold095 |
| 190135.49 | 0.9961020 | 128383.47 | Fold096 |
| 159917.75 | 0.9963515 | 76691.39 | Fold097 |
| 407381.55 | 0.9961371 | 143143.95 | Fold098 |
| 78599.00 | 0.9729700 | 53994.23 | Fold099 |
| 125683.41 | 0.9740752 | 86213.44 | Fold100 |
m2.cv$resample %>%
pivot_longer(-Resample) %>%
mutate(name = as.factor(name)) %>%
ggplot(., aes(x = name, y = value, color = name)) +
geom_jitter(width = 0.1) +
facet_wrap(~name, ncol = 3, scales = "free") +
theme_bw() +
theme(legend.position = "none") +
labs(title = 'Cross_validation Test: Distribution of MAE, RMSE, R Squared\n',
caption = "Figure RESULT 4.1") +
plotTheme()
ggplot(data = m2.cv$resample) +
geom_histogram(aes(x = m2.cv$resample$MAE), fill = 'orange') +
labs(title="Distribution of Cross-validation MAE\n",
subtitle = "K = 100",
caption = "Figure RESULT 4.2") +
xlab('MAE of Model 2') +
ylab('Count') +
plotTheme()
# If the model generalized well, the distribution of errors would cluster tightly together.
# Instead, this range of errors suggests the model predicts inconsistently, and would likely be unreliable for predicting houses that have not recently sold.
# 5. Plot predicted prices as a function of observed prices
preds.train <- data.frame(pred = predict(m2.training, model.miami.training),
actual = model.miami.training$SalePrice,
source = "training data")
preds.test <- data.frame(pred = predict(m2.training,newdata = model.miami.test),
actual = model.miami.test$SalePrice,
source = "test data")
preds <- rbind(preds.train, preds.test)
## Overall Plotting
ggplot(preds, aes(x = actual, y = pred, color = source)) +
geom_point() +
geom_smooth(method = "lm", color = "darkblue", ) +
geom_abline(color = "orange") +
coord_equal() +
theme_bw() +
labs(title = "Predicted Prices as a Function of Observed Prices\n",
caption = 'Figure RESULT 5.1',
x = "Observed Prices",
y = "Predicted Prices") +
theme(
legend.position = "none"
) +
plotTheme()
ggplot(preds, aes(x = actual, y = pred, color = source)) +
geom_point() +
geom_smooth(method = "lm", color = "green") +
geom_abline(color = "orange") +
coord_equal() +
theme_bw() +
facet_wrap(~source, ncol = 2) +
labs(title = "Predicted Prices as a Function of Observed Prices, by Sets\n",
caption = 'Figure RESULT 5.2',
x = "Observed Prices",
y = "Predicted Prices") +
theme(
legend.position = "none"
) +
plotTheme()
# 6. Residual Exploration
## 6.1 A map of residuals for test set
map.res.test <- ggplot() +
geom_sf(data = st_union(miami.base),fill = 'grey') +
geom_sf(data = m2.miami.test,aes(color = q5(SalePrice.AbsError)), show.legend = "point",size = 1) +
scale_color_manual(values = palette,
labels = qBr(m2.miami.test,'SalePrice.AbsError'),
name = "Residuals") +
labs(title = 'Map of residuals for test set\n',
caption = 'Figure 6.1') +
mapTheme() +
plotTheme()
map.res.test
## 6.2 Spatial lag in errors: average errors in five nearest neighbors
k_nearest_neighbors = 5
# average errors in five nearest neighbors
coords.test <- st_centroid(st_geometry(m2.miami.test), of_largest_polygon=TRUE)
neighborList.test <- knn2nb(knearneigh(coords.test, k_nearest_neighbors))
spatialWeights.test <- nb2listw(neighborList.test, style="W")
m2.miami.test$lagPriceError <- lag.listw(spatialWeights.test,m2.miami.test$SalePrice.Error)
error.spatial.lag <- ggplot(m2.miami.test, aes(x = lagPriceError, y = SalePrice)) +
geom_point(colour = "#6897BB") +
geom_smooth(method = "lm", se = FALSE, colour = "#05A167") +
labs(title = "Error on Test Set as a function of the Spatial Lag of Price",
caption = "Figure RESULT 6.2",
x = "Spatial lag of errors (Mean error of 5 nearest neighbors)",
y = "Sale Price") +
plotTheme()
error.spatial.lag
## 6.3 Moran's I Test
moranTest <- moran.mc(m2.miami.test$SalePrice.AbsError,
spatialWeights.test, nsim = 999)
moran.plot <- ggplot(as.data.frame(moranTest$res[c(1:999)]), aes(moranTest$res[c(1:999)])) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = moranTest$statistic), colour = "#6897BB",size=1) +
scale_x_continuous(limits = c(-1, 1)) +
labs(title="Observed and permuted Moran's I",
subtitle= "Observed Moran's I in orange",
x="Moran's I",
y="Count",
caption="Figure Result 6.3") +
plotTheme()
moran.plot
# 7. Map of predicted values for the whole dataset
miami.sf$Predicted.Price<- predict(m2.training, newdata = miami.sf)
map.predicted.price <- ggplot() +
geom_sf(data = st_union(miami.base),fill = 'grey') +
geom_sf(data = st_centroid(miami.sf),aes(color = q5(Predicted.Price)),size = .5) +
scale_color_manual(values = palette,
labels = qBr(miami.sf,'Predicted.Price'),
name = "Price, $") +
labs(title = 'Map of Predicted Sale Price\n',
subtitle = '',
caption = 'Figure RESULT 7') +
mapTheme() +
plotTheme()
map.predicted.price
# 8. Map of MAPE by neighborhood
## 8.1 Statistic summary by neighborhood group
nhood_sum <- m2.miami.test %>%
group_by(LABEL) %>%
summarize(meanPrice = mean(SalePrice, na.rm = T),
meanPrediction = mean(SalePrice.Predict, na.rm = T),
meanMAE = mean(SalePrice.AbsError, na.rm = T),
meanMAPE = mean(SalePrice.APE, na.rm = T))
nhood_sum %>%
st_drop_geometry %>%
arrange(desc(meanMAPE)) %>%
kable(caption = 'Table RESULT 8. Map of MAPE by Neighborhood') %>%
kable_styling("striped", full_width = F)
| LABEL | meanPrice | meanPrediction | meanMAE | meanMAPE |
|---|---|---|---|---|
| Belle Meade | 525000.0 | 693827.2 | 168827.210 | 0.3215756 |
| Flora Park | 235000.0 | 173195.4 | 61804.585 | 0.2629982 |
| Hadley Park | 218181.8 | 220490.7 | 40196.438 | 0.2629211 |
| Auburndale | 315000.0 | 394983.8 | 79983.783 | 0.2515129 |
| King Heights | 247500.0 | 186553.2 | 60946.762 | 0.2426961 |
| Douglas Park | 330000.0 | 395485.9 | 65485.918 | 0.1984422 |
| South Grove | 1089285.7 | 1065505.8 | 196290.348 | 0.1943366 |
| Edison | 225000.0 | 245860.4 | 39177.641 | 0.1635606 |
| Buena Vista West | 282500.0 | 241985.2 | 40514.803 | 0.1379306 |
| Flagami | 348550.7 | 341875.8 | 41095.196 | 0.1176901 |
| Shenandoah South | 435000.0 | 414730.8 | 40233.809 | 0.0980084 |
| Shenandoah North | 425000.0 | 447634.1 | 35817.841 | 0.0978073 |
| Silver Bluff | 437500.0 | 468627.9 | 39720.576 | 0.0870146 |
| NA | 1542931.0 | 1573447.3 | 109346.042 | 0.0858543 |
| Citrus Grove | 316666.7 | 290469.0 | 26197.670 | 0.0856725 |
| La Pastorita | 380000.0 | 348617.1 | 31382.939 | 0.0825867 |
| Liberty Square | 220000.0 | 201958.8 | 18041.217 | 0.0820055 |
| Northwestern Estates | 190000.0 | 202313.3 | 12313.323 | 0.0648070 |
| West Grove | 550000.0 | 584857.9 | 34857.907 | 0.0633780 |
| Roads | 600000.0 | 565369.8 | 34630.229 | 0.0577170 |
| North Grapeland Heights | 290000.0 | 301803.1 | 11803.125 | 0.0407004 |
| Shorecrest | 638333.3 | 656533.8 | 18200.425 | 0.0315013 |
| Santa Clara | 220000.0 | 226119.9 | 6119.898 | 0.0278177 |
| Melrose | 230000.0 | 224225.4 | 5774.562 | 0.0251068 |
| Baypoint | 1800000.0 | 1783610.7 | 16389.297 | 0.0091052 |
## 8.2 Map of MAPE by neighborhood
miami.neigh.sum <- miami.neigh %>%
left_join(st_drop_geometry(nhood_sum), by = "LABEL") %>%
mutate(meanMAPE.expanded = meanMAPE * 100)
map.MAPE.nhood <- ggplot() +
geom_sf(data = miami.neigh.sum,
aes(fill = q5(meanMAPE.expanded))) +
scale_fill_manual(values = palette,
labels = qBr(miami.neigh.sum,'meanMAPE.expanded'),
name = "MAPE\n(timed by 100 for better display)") +
labs(title = 'Map of MAPE on Test Set, by Neighborhood\n',
subtitle = '',
caption = 'Figure RESULT 8') +
mapTheme() +
plotTheme()
map.MAPE.nhood
# 9. Scatter plot of MAPE by neighborhood as a function of mean price by neighborhood
MAPE.nhood.plot <-ggplot()+
geom_point(data = nhood_sum, aes(x = meanPrice, y = meanMAPE)) +
stat_smooth(data = nhood_sum, aes(x = meanPrice, y = meanMAPE), method = "loess", se = FALSE, size = 1, colour="red") +
labs(title = "MAPE by Neighborhood as a Function of Mean Price by Neighborhood\n",
caption = 'Figure RESULT 9') +
xlab('Mean Price by Neighborhood') +
ylab('Mean MAPE by Neighborhood') +
plotTheme()
MAPE.nhood.plot
# 10. Model generalizability on census data
## 10.1 Group Plotting
census <- miami.training %>% # miami.training contains all features we used including census data and geometry information
mutate(raceContext = ifelse(pctWhite > .5, "Majority White", "Majority Non-White"),
incomeContext = ifelse(MedHHInc > mean(MedHHInc,na.rm = T), "High Income", "Low Income"),
vacantContext = ifelse(pctTotalVacant > .5, "Majority Vacant", "Majority Non-Vacant"),
pctRenterContext = ifelse(pctPoverty > .5, "Majority Renter Occupied", "Majority Non-Renter Occupied")) %>%
select(raceContext,incomeContext,vacantContext,pctRenterContext)
census <- census %>%
st_transform(st_crs(tracts17))
grid.arrange(ncol = 2,
ggplot() +
geom_sf(data = st_union(miami.base), fill = 'grey') +
geom_sf(data = na.omit(census), aes(color = raceContext),
show.legend = "point", size= 1) +
scale_color_manual(values = c("#05A167", "#6897BB"), name="Race Context") +
labs(title = "Race Context of House\n") +
mapTheme() +
theme(legend.position="bottom") +
plotTheme(),
ggplot() +
geom_sf(data = st_union(miami.base), fill = 'grey') +
geom_sf(data = na.omit(census), aes(color = incomeContext),
show.legend = "point", size= 1) +
scale_fill_manual(values = c("#05A167", "#6897BB"), name="Income Context") +
labs(title = "Income Context of House\n") +
mapTheme() +
theme(legend.position="bottom") +
plotTheme())
grid.arrange(ncol = 2,
ggplot() +
geom_sf(data = st_union(miami.base), fill = 'grey') +
geom_sf(data = na.omit(census), aes(color = vacantContext),
show.legend = "point", size= 1) +
scale_color_manual(values = c("#05A167", "#6897bb"), name="Vacant Context") +
labs(title = "Vacant Context of House\n") +
mapTheme() +
theme(legend.position="bottom") +
plotTheme(),
ggplot() +
geom_sf(data = st_union(miami.base), fill = 'grey') +
geom_sf(data = na.omit(census), aes(color = pctRenterContext),
show.legend = "point", size= 1) +
scale_fill_manual(values = c("#05A167", "#6897BB"), name="Renter Proportion Context") +
labs(title = "Renter Proportion Context of House\n") +
mapTheme() +
theme(legend.position="bottom") +
plotTheme())
### Based on the group distribution, we decided to select race group and income group
## 10.2 Neighborhood Fixed Effect
### 10.2.1 Model building
m2.nhood <- lm(SalePrice ~ ., data = as.data.frame(model.miami.training) %>%
dplyr::select(# spatial features
LABEL,
# internal features
SalePrice, Land, Bldg, Assessed, County.Taxable,
AdjustedSqFt, LotSize, Bed,Bath, ActualSqFt,
# amenity features
college_nn1,
school_nn3, school_nn4, school_nn5,
pctTotalVacant, MedRent, MedHHInc)
)
m2.miami.test.nhood <-
model.miami.test %>%
mutate(Regression = "Neighborhood Effects",
SalePrice.Predict = predict(m2.nhood, model.miami.test),
SalePrice.Error = SalePrice - SalePrice.Predict,
SalePrice.AbsError = abs(SalePrice - SalePrice.Predict),
SalePrice.APE = (abs(SalePrice - SalePrice.Predict)) / SalePrice) %>%
filter(SalePrice < 5000000)
### 10.2.2. Model combination and comparison
bothRegressions <-
rbind(
dplyr::select(m2.miami.test, starts_with("SalePrice"), Regression, LABEL) ,
dplyr::select(m2.miami.test.nhood, starts_with("SalePrice"), Regression, LABEL) )
st_drop_geometry(bothRegressions) %>%
gather(Variable, Value, -Regression, -LABEL) %>%
filter(Variable == "SalePrice.AbsError" | Variable == "SalePrice.APE") %>%
group_by(Regression, Variable) %>%
summarize(meanValue = mean(Value, na.rm = T)) %>%
spread(Variable, meanValue) %>%
kable(caption = 'Table RESULT 10.1.\nSummary of MAE, and MAPE by Regression Models') %>%
kable_styling("striped", full_width = F) %>%
row_spec(1, color = "black", background = "#b2eed7") %>%
row_spec(2, color = "black", background = '#6897BB')
| Regression | SalePrice.AbsError | SalePrice.APE |
|---|---|---|
| Baseline Regression | 59645.31 | 0.1236198 |
| Neighborhood Effects | 47095.86 | 0.1246893 |
### 10.2.3. Model coefficents for each Neighborhood
tidy(m2.nhood) %>%
filter(str_detect(term, "LABEL")) %>%
kable(caption = " RESULT 10.2.\nModel Coefficents for Each Neighborhood") %>%
kable_styling("striped", full_width = F)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| LABELAllapattah Industrial District | -1.455955e+05 | 138467.7 | -1.0514767 | 0.2931701 |
| LABELAuburndale | -2.453581e+04 | 125126.1 | -0.1960886 | 0.8445613 |
| LABELBaypoint | 6.828966e+04 | 130304.4 | 0.5240777 | 0.6002841 |
| LABELBayside | -2.647159e+04 | 128577.4 | -0.2058806 | 0.8369057 |
| LABELBelle Island | 2.697437e+05 | 139362.7 | 1.9355514 | 0.0530672 |
| LABELBelle Meade | 1.424062e+04 | 128106.8 | 0.1111621 | 0.9114993 |
| LABELBelle Meade West | -1.097660e+04 | 130002.0 | -0.0844341 | 0.9327200 |
| LABELBird Grove East | 4.070278e+04 | 132166.2 | 0.3079667 | 0.7581406 |
| LABELBird Grove West | -7.011467e+04 | 130690.2 | -0.5364953 | 0.5916775 |
| LABELBiscayne Island | -1.440752e+05 | 158641.3 | -0.9081819 | 0.3638945 |
| LABELBiscayne Plaza | 5.225887e+04 | 153182.7 | 0.3411539 | 0.7330245 |
| LABELBrentwood | -2.131171e+04 | 143583.4 | -0.1484274 | 0.8820208 |
| LABELBuena Vista Heights | -9.726231e+03 | 127560.0 | -0.0762483 | 0.9392294 |
| LABELBuena Vista West | -6.059221e+04 | 125336.3 | -0.4834372 | 0.6288396 |
| LABELCitrus Grove | -4.271731e+04 | 125380.5 | -0.3407013 | 0.7333652 |
| LABELCivic Center | -1.182140e+05 | 142517.9 | -0.8294678 | 0.4069414 |
| LABELCoral Gate | 4.552733e+03 | 126643.2 | 0.0359493 | 0.9713265 |
| LABELCurtis Park | -5.271167e+04 | 128184.4 | -0.4112174 | 0.6809584 |
| LABELDouglas Park | -2.089778e+04 | 125804.1 | -0.1661136 | 0.8680848 |
| LABELEast Little Havana | -7.344143e+04 | 128105.5 | -0.5732889 | 0.5665153 |
| LABELEdgewater | 1.360024e+05 | 144524.6 | 0.9410327 | 0.3468048 |
| LABELEdison | -6.515910e+04 | 125526.5 | -0.5190866 | 0.6037593 |
| LABELFlagami | -1.404877e+04 | 124441.0 | -0.1128950 | 0.9101254 |
| LABELFlora Park | -8.299577e+04 | 126429.9 | -0.6564566 | 0.5116079 |
| LABELGrove Center | -4.988598e+00 | 139148.4 | -0.0000359 | 0.9999714 |
| LABELHadley Park | -9.098398e+04 | 125748.7 | -0.7235381 | 0.4694363 |
| LABELHaynesworth | 2.664030e+05 | 176583.2 | 1.5086547 | 0.1315492 |
| LABELHighland Park | 1.624237e+05 | 139586.9 | 1.1636021 | 0.2447278 |
| LABELHistoric Buena Vista East | -5.887017e+04 | 131111.6 | -0.4490083 | 0.6534756 |
| LABELKing Heights | -7.867429e+04 | 127028.1 | -0.6193458 | 0.5357610 |
| LABELLa Pastorita | -2.033587e+04 | 134827.2 | -0.1508291 | 0.8801262 |
| LABELLatin Quarter | -8.384839e+04 | 138215.7 | -0.6066490 | 0.5441545 |
| LABELLe Jeune Gardens | 3.017592e+03 | 174940.9 | 0.0172492 | 0.9862396 |
| LABELLemon City/Little Haiti | -2.214532e+04 | 127569.1 | -0.1735947 | 0.8622020 |
| LABELLiberty Square | -1.348404e+05 | 129786.3 | -1.0389421 | 0.2989606 |
| LABELLittle River Central | -5.323658e+04 | 128157.4 | -0.4153999 | 0.6778950 |
| LABELLittle River Gardens | -3.572217e+04 | 150885.5 | -0.2367502 | 0.8128755 |
| LABELMelrose | 2.316001e+04 | 128219.4 | 0.1806280 | 0.8566784 |
| LABELMiami Avenue | 6.167048e+04 | 132884.1 | 0.4640923 | 0.6426335 |
| LABELMorningside | 4.408132e+04 | 128751.9 | 0.3423742 | 0.7321063 |
| LABELNorth Grapeland Heights | 1.997696e+03 | 125911.9 | 0.0158658 | 0.9873431 |
| LABELNorth Grove | 1.147757e+04 | 138338.2 | 0.0829675 | 0.9338859 |
| LABELNorth Sewell Park | -7.512163e+04 | 129458.0 | -0.5802781 | 0.5617942 |
| LABELNortheast Overtown | -6.242012e+04 | 140266.5 | -0.4450110 | 0.6563613 |
| LABELNorthwestern Estates | -7.215179e+04 | 126700.1 | -0.5694692 | 0.5691035 |
| LABELOakland Grove | -1.894625e+04 | 174008.7 | -0.1088810 | 0.9133081 |
| LABELOld San Juan | 2.561181e+04 | 127711.4 | 0.2005444 | 0.8410757 |
| LABELOrange Bowl | -1.127608e+05 | 143036.9 | -0.7883335 | 0.4305976 |
| LABELOrchard Villa | -7.445528e+04 | 126751.9 | -0.5874098 | 0.5569966 |
| LABELPalm Grove | 1.863426e+03 | 141530.0 | 0.0131663 | 0.9894965 |
| LABELParkdale North | -5.013638e+04 | 127332.1 | -0.3937450 | 0.6938124 |
| LABELParkdale South | -2.773440e+04 | 127752.2 | -0.2170953 | 0.8281568 |
| LABELRoads | -7.341516e+03 | 124568.8 | -0.0589355 | 0.9530096 |
| LABELSan Marco Island | 3.545570e+05 | 151666.4 | 2.3377418 | 0.0195014 |
| LABELSanta Clara | -5.876175e+04 | 125795.2 | -0.4671224 | 0.6404646 |
| LABELShenandoah North | -1.678461e+04 | 124462.4 | -0.1348569 | 0.8927390 |
| LABELShenandoah South | -1.636229e+04 | 125341.8 | -0.1305414 | 0.8961516 |
| LABELShorecrest | -3.823702e+04 | 125582.9 | -0.3044765 | 0.7607974 |
| LABELSilver Bluff | -1.725024e+04 | 125487.4 | -0.1374659 | 0.8906767 |
| LABELSouth Grapeland Heights | -1.889689e+04 | 126521.5 | -0.1493572 | 0.8812873 |
| LABELSouth Grove | 4.154291e+04 | 154600.8 | 0.2687108 | 0.7881807 |
| LABELSouth Grove Bayside | 2.015704e+05 | 156367.7 | 1.2890795 | 0.1975235 |
| LABELSouth Sewell Park | -4.811909e+04 | 128008.7 | -0.3759049 | 0.7070285 |
| LABELSpring Garden | 9.631367e+04 | 143613.0 | 0.6706474 | 0.5025247 |
| LABELWest Grapeland Heights | 8.373375e+03 | 128665.8 | 0.0650785 | 0.9481182 |
| LABELWest Grove | -8.904311e+04 | 127163.0 | -0.7002282 | 0.4838684 |
### 10.2.4 Generalizability in Different Groups
census <- miami.training %>% # miami.training contains all features we used including census data and geometry information
mutate(raceContext = ifelse(pctWhite > .5, "Majority White", "Majority Non-White"),
incomeContext = ifelse(MedHHInc > mean(MedHHInc,na.rm = T), "High Income", "Low Income"),
vacantContext = ifelse(pctTotalVacant > .5, "Majority Vacant", "Majority Non-Vacant"),
pctRenterContext = ifelse(pctPoverty > .5, "Majority Renter Occupied", "Majority Non-Renter Occupied")) %>%
select(raceContext,incomeContext,vacantContext,pctRenterContext)
race.group <- st_join(bothRegressions, census) %>%
group_by(Regression, raceContext) %>%
summarize(mean.MAPE = scales::percent(mean(SalePrice.APE, na.rm = T))) %>%
st_drop_geometry() %>%
spread(raceContext, mean.MAPE) %>%
kable(caption = "Table RESULT 10.3.\nTest set MAPE by Neighborhood Racial Context") %>%
kable_styling("striped", full_width = F)
race.group
| Regression | Majority Non-White | Majority White |
|---|---|---|
| Baseline Regression | 19% | 11% |
| Neighborhood Effects | 20% | 11% |
income.group <- st_join(bothRegressions, census) %>%
group_by(Regression, incomeContext) %>%
summarize(mean.MAPE = scales::percent(mean(SalePrice.APE, na.rm = T))) %>%
st_drop_geometry() %>%
spread(incomeContext, mean.MAPE) %>%
kable(caption = " RESULT 10.4.\nTest set MAPE by Neighborhood Income Context") %>%
kable_styling("striped", full_width = F)
income.group
| Regression | High Income | Low Income |
|---|---|---|
| Baseline Regression | 11% | 13% |
| Neighborhood Effects | 20% | 13% |
Eventually, we decided not to include neighborhood features in the model.
MODEL <- lm(SalePrice ~ ., data = as.data.frame(miami.training) %>%
dplyr::select(
# internal features
SalePrice, Land, Bldg, Assessed, County.Taxable,
AdjustedSqFt, LotSize, Bed,Bath, ActualSqFt,
# amenity features
college_nn1,
school_nn3, school_nn4, school_nn5,
pctTotalVacant, MedRent, MedHHInc)
)
secret_data <- miami.test%>%
dplyr::select(Folio,
# internal features
SalePrice, Land, Bldg, Assessed, County.Taxable,
AdjustedSqFt, LotSize, Bed,Bath, ActualSqFt,
# amenity features
college_nn1,
school_nn3, school_nn4, school_nn5,
pctTotalVacant, MedRent, MedHHInc)
MedRent.no.na <- mean(secret_data$MedRent,na.rm = TRUE)
secret_data[is.na(secret_data)] <- MedRent.no.na
secret_preds <- predict(MODEL, newdata = secret_data)
output_preds <- data.frame(prediction = secret_preds, Folio = secret_data$Folio, team_name = "MySun")
write.csv(output_preds, "MySun.csv")
Is this an effective model? What were some of the more interesting variables?
How much of the variation in prices could you predict? Describe the more important features? Describe the error in your predictions?
According to your maps, could you account the spatial variation in prices?
Where did the model predict particularly well? Poorly? Why do you think this might be?
Would you recommend your model to Zillow? Why or why not? How might you improve this model?
Incorporate restaurant and bar features. Since Miami is famous for night life and party, restaurant and bar location might be relevant to the house price. However, there is no open data available for Miami restaurants.